Skip to content

Commit e74077d

Browse files
committed
[Data Liberation] Support sourcing Attachments from non-local filesystems
Adds support in WP_Attachment_Downloader for reading attachments from a WP_Filesystem instance instead of only from the local disk. This unlocks importing content from, say, a zip file while still retrieving all of its attachments correctly.
1 parent de7ff45 commit e74077d

File tree

2 files changed

+95
-52
lines changed

2 files changed

+95
-52
lines changed

packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,17 @@ class WP_Attachment_Downloader {
88
private $fps = array();
99
private $output_root;
1010
private $output_paths = array();
11+
private $source_from_filesystem;
1112

1213
private $current_event;
1314
private $pending_events = array();
1415
private $enqueued_url;
1516
private $progress = array();
1617

17-
public function __construct( $output_root ) {
18-
$this->client = new Client();
19-
$this->output_root = $output_root;
18+
public function __construct( $output_root, $options = array() ) {
19+
$this->client = new Client();
20+
$this->output_root = $output_root;
21+
$this->source_from_filesystem = $options['source_from_filesystem'] ?? null;
2022
}
2123

2224
public function get_progress() {
@@ -37,24 +39,24 @@ public function has_pending_requests() {
3739
}
3840

3941
public function enqueue_if_not_exists( $url, $output_relative_path ) {
40-
$this->enqueued_url = $url;
41-
$output_relative_path = $this->output_root . '/' . ltrim( $output_relative_path, '/' );
42-
if ( file_exists( $output_relative_path ) ) {
42+
$this->enqueued_url = $url;
43+
$output_path = wp_join_paths( $this->output_root, $output_relative_path );
44+
if ( file_exists( $output_path ) ) {
4345
$this->pending_events[] = new WP_Attachment_Downloader_Event(
4446
$this->enqueued_url,
4547
WP_Attachment_Downloader_Event::ALREADY_EXISTS
4648
);
4749
return true;
4850
}
49-
if ( file_exists( $output_relative_path . '.partial' ) ) {
51+
if ( file_exists( $output_path . '.partial' ) ) {
5052
$this->pending_events[] = new WP_Attachment_Downloader_Event(
5153
$this->enqueued_url,
5254
WP_Attachment_Downloader_Event::IN_PROGRESS
5355
);
5456
return true;
5557
}
5658

57-
$output_dir = dirname( $output_relative_path );
59+
$output_dir = dirname( $output_path );
5860
if ( ! file_exists( $output_dir ) ) {
5961
// @TODO: think through the chmod of the created directory.
6062
mkdir( $output_dir, 0777, true );
@@ -67,15 +69,32 @@ public function enqueue_if_not_exists( $url, $output_relative_path ) {
6769

6870
switch ( $protocol ) {
6971
case 'file':
70-
$local_path = parse_url( $url, PHP_URL_PATH );
71-
if ( false === $local_path ) {
72+
if ( ! $this->source_from_filesystem ) {
73+
_doing_it_wrong( __METHOD__, 'Cannot process file:// URLs without a source filesystem instance. Use the source_from_filesystem option to pass in a filesystem instance to WP_Attachment_Downloader.', '1.0' );
74+
return false;
75+
}
76+
$source_path = parse_url( $url, PHP_URL_PATH );
77+
if ( false === $source_path ) {
7278
return false;
7379
}
7480

7581
// Just copy the file over.
7682
// @TODO: think through the chmod of the created file.
83+
$success = $this->source_from_filesystem->open_read_stream( $source_path );
84+
if ( $success ) {
85+
$fp = fopen( $output_path, 'wb' );
86+
// @TODO: Filesystem instance error handling.
87+
while ( $this->source_from_filesystem->next_file_chunk() ) {
88+
$chunk = $this->source_from_filesystem->get_file_chunk();
89+
fwrite( $fp, $chunk );
90+
}
91+
$this->source_from_filesystem->close_read_stream();
92+
fclose( $fp );
93+
if ( $this->source_from_filesystem->get_last_error() ) {
94+
$success = false;
95+
}
96+
}
7797

78-
$success = copy( $local_path, $output_relative_path );
7998
$this->pending_events[] = $success
8099
? new WP_Attachment_Downloader_Event(
81100
$this->enqueued_url,
@@ -90,7 +109,7 @@ public function enqueue_if_not_exists( $url, $output_relative_path ) {
90109
case 'http':
91110
case 'https':
92111
// Create a placeholder file to indicate that the download is in progress.
93-
touch( $output_relative_path . '.partial' );
112+
touch( $output_path . '.partial' );
94113
$request = new Request( $url );
95114
$this->output_paths[ $request->id ] = $output_relative_path;
96115
$this->progress[ $this->enqueued_url ] = array(

packages/playground/data-liberation/src/import/WP_Stream_Importer.php

Lines changed: 64 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,9 @@ protected function index_next_entities( $count = 10000 ) {
418418
}
419419
}
420420
// @TODO: Consider using sha1 hashes to prevent huge URLs from blowing up the memory.
421-
$this->indexed_assets_urls[ $data['attachment_url'] ] = true;
421+
if ( isset( $data['attachment_url'] ) ) {
422+
$this->indexed_assets_urls[ $data['attachment_url'] ] = true;
423+
}
422424
} elseif ( isset( $data['post_content'] ) ) {
423425
$post = $data;
424426
$p = new WP_Block_Markup_Url_Processor( $post['post_content'], $this->source_site_url );
@@ -522,7 +524,10 @@ protected function frontload_next_entity() {
522524
if ( null === $this->next_stage ) {
523525
$this->entity_iterator->set_entities_iterator( $this->create_entity_iterator() );
524526
}
525-
$this->downloader = new WP_Attachment_Downloader( $this->options['uploads_path'] );
527+
$this->downloader = new WP_Attachment_Downloader(
528+
$this->options['uploads_path'],
529+
$this->options['attachment_downloader_options'] ?? array()
530+
);
526531
}
527532

528533
// Clear the frontloading events from the previous pass.
@@ -585,7 +590,12 @@ protected function frontload_next_entity() {
585590
break;
586591
case 'post':
587592
if ( isset( $data['post_type'] ) && $data['post_type'] === 'attachment' ) {
588-
$this->enqueue_attachment_download( $data['attachment_url'] );
593+
if ( isset( $data['attachment_url'] ) ) {
594+
$this->enqueue_attachment_download( $data['attachment_url'] );
595+
} else {
596+
// @TODO: Emit warning / error event
597+
_doing_it_wrong( __METHOD__, 'No attachment URL or file path found in the post entity.', '1.0' );
598+
}
589599
} elseif ( isset( $data['post_content'] ) ) {
590600
$post = $data;
591601
$p = new WP_Block_Markup_Url_Processor( $post['post_content'], $this->source_site_url );
@@ -596,7 +606,7 @@ protected function frontload_next_entity() {
596606
$this->enqueue_attachment_download(
597607
$p->get_raw_url(),
598608
array(
599-
'context_path' => $post['source_path'] ?? $post['slug'] ?? null,
609+
'context_path' => $post['local_file_path'] ?? $post['slug'] ?? null,
600610
)
601611
);
602612
}
@@ -644,52 +654,66 @@ protected function import_next_entity() {
644654
switch ( $entity->get_type() ) {
645655
case 'post':
646656
$data = $entity->get_data();
647-
foreach ( array( 'guid', 'post_content', 'post_excerpt' ) as $key ) {
648-
if ( ! isset( $data[ $key ] ) ) {
649-
continue;
657+
if ( isset( $data['post_type'] ) && $data['post_type'] === 'attachment' ) {
658+
if ( ! isset( $data['attachment_url'] ) ) {
659+
// @TODO: Emit warning / error event
660+
_doing_it_wrong( __METHOD__, 'No attachment URL or file path found in the post entity.', '1.0' );
661+
break;
650662
}
651-
$p = new WP_Block_Markup_Url_Processor( $data[ $key ], $this->source_site_url );
652-
while ( $p->next_url() ) {
653-
// Relative URLs are okay at this stage.
654-
if ( ! $p->get_raw_url() ) {
663+
$asset_filename = $this->new_asset_filename(
664+
$data['attachment_url'],
665+
$data['local_file_path'] ?? $data['slug'] ?? null
666+
);
667+
unset( $data['attachment_url'] );
668+
$data['local_file_path'] = $this->options['uploads_path'] . '/' . $asset_filename;
669+
} else {
670+
foreach ( array( 'guid', 'post_content', 'post_excerpt' ) as $key ) {
671+
if ( ! isset( $data[ $key ] ) ) {
655672
continue;
656673
}
674+
$p = new WP_Block_Markup_Url_Processor( $data[ $key ], $this->source_site_url );
675+
while ( $p->next_url() ) {
676+
// Relative URLs are okay at this stage.
677+
if ( ! $p->get_raw_url() ) {
678+
continue;
679+
}
657680

658-
/**
659-
* Any URL that has a corresponding frontloaded file is an asset URL.
660-
*/
661-
$asset_filename = $this->new_asset_filename(
662-
$p->get_raw_url(),
663-
$data['source_path'] ?? $data['slug'] ?? null
664-
);
665-
if ( file_exists( $this->options['uploads_path'] . '/' . $asset_filename ) ) {
666-
$p->set_raw_url(
667-
$this->options['uploads_url'] . '/' . $asset_filename
668-
);
669681
/**
670-
* @TODO: How would we know a specific image block refers to a specific
671-
* attachment? We need to cross-correlate that to rewrite the URL.
672-
* The image block could have query parameters, too, but presumably the
673-
* path would be the same at least? What if the same file is referred
674-
* to by two different URLs? e.g. assets.site.com and site.com/assets/ ?
675-
* A few ideas: GUID, block attributes, fuzzy matching. Maybe a configurable
676-
* strategy? And the API consumer would make the decision?
682+
* Any URL that has a corresponding frontloaded file is an asset URL.
677683
*/
678-
continue;
679-
}
684+
$asset_filename = $this->new_asset_filename(
685+
$p->get_raw_url(),
686+
$data['local_file_path'] ?? $data['slug'] ?? null
687+
);
688+
if ( file_exists( $this->options['uploads_path'] . '/' . $asset_filename ) ) {
689+
$p->set_raw_url(
690+
$this->options['uploads_url'] . '/' . $asset_filename
691+
);
692+
/**
693+
* @TODO: How would we know a specific image block refers to a specific
694+
* attachment? We need to cross-correlate that to rewrite the URL.
695+
* The image block could have query parameters, too, but presumably the
696+
* path would be the same at least? What if the same file is referred
697+
* to by two different URLs? e.g. assets.site.com and site.com/assets/ ?
698+
* A few ideas: GUID, block attributes, fuzzy matching. Maybe a configurable
699+
* strategy? And the API consumer would make the decision?
700+
*/
701+
continue;
702+
}
680703

681-
// Absolute URLs are required at this stage.
682-
if ( ! $p->get_parsed_url() ) {
683-
continue;
684-
}
704+
// Absolute URLs are required at this stage.
705+
if ( ! $p->get_parsed_url() ) {
706+
continue;
707+
}
685708

686-
$target_base_url = $this->get_url_mapping_target( $p->get_parsed_url() );
687-
if ( false !== $target_base_url ) {
688-
$p->replace_base_url( $target_base_url );
689-
continue;
709+
$target_base_url = $this->get_url_mapping_target( $p->get_parsed_url() );
710+
if ( false !== $target_base_url ) {
711+
$p->replace_base_url( $target_base_url );
712+
continue;
713+
}
690714
}
715+
$data[ $key ] = $p->get_updated_html();
691716
}
692-
$data[ $key ] = $p->get_updated_html();
693717
}
694718
$entity->set_data( $data );
695719
break;

0 commit comments

Comments
 (0)